Skip to content
标签
爬取信息
字数
381 字
阅读时间
3 分钟

一、概述

二、使用示例

2.1 入门Demo

依赖

xml
<dependency>
	<groupId>us.codecraft</groupId>
	<artifactId>webmagic-core</artifactId>
	<version>0.7.3</version>
</dependency>
<dependency>
	<groupId>us.codecraft</groupId>
	<artifactId>webmagic-extension</artifactId>
	<version>0.7.3</version>
</dependency>

代码实现：爬虫实现

java
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;

import java.util.List;

/**
 * WebMagic {@link PageProcessor} for the rental pages under
 * {@code https://sh.lianjia.com/zufang/}.
 *
 * <p>For every page it queues the links found under the listing-title selector and
 * extracts the fields {@code title}, {@code rent}, {@code type}, {@code info} and
 * {@code img}. A page on which no title was extracted is marked as skipped and is
 * used to queue the paginated list URLs {@code pg1} to {@code pg100} instead.
 */
public class LianjiaPageProcessor implements PageProcessor {

    /** CSS selector for the anchors whose links are queued as further requests. */
    private static final String DETAIL_LINK_SELECTOR = ".content__list--item--title a";

    /** Prefix of the paginated list URLs; the page number is appended to it. */
    private static final String LIST_PAGE_PREFIX = "https://sh.lianjia.com/zufang/pg";

    /** Highest page number that is queued. */
    private static final int LAST_LIST_PAGE = 100;

    /** Crawl settings: retry times 3, sleep time 1000 and timeout 10000. */
    private Site site = Site.me().setRetryTimes(3).setSleepTime(1000).setTimeOut(10000);

    /**
     * Queues follow-up requests and stores the extracted fields on the page.
     *
     * @param page the downloaded page to process
     */
    @Override
    public void process(Page page) {
        Html document = page.getHtml();

        // Links to the individual listing pages.
        List<String> detailLinks = document.css(DETAIL_LINK_SELECTOR).links().all();
        page.addTargetRequests(detailLinks);

        page.putField("title", select(document, "//div[@class='content clear w1150']/p/text()"));
        page.putField("rent", select(document, "//p[@class='content__aside--title']/span/text()"));
        page.putField("type", select(document, "//p[@class='content__article__table']/allText()"));
        page.putField("info", select(document, "//div[@class='content__article__info']/allText()"));
        page.putField("img", select(document, "//div[@class='content__article__slide__item']/img"));

        boolean titleFound = page.getResultItems().get("title") != null;
        if (titleFound) {
            return;
        }

        // No title was extracted: skip this page's results and queue the pagination links.
        page.setSkip(true);
        for (int pageNo = 1; pageNo <= LAST_LIST_PAGE; pageNo++) {
            page.addTargetRequest(LIST_PAGE_PREFIX + pageNo);
        }
    }

    /**
     * Returns the crawl settings used by this processor.
     *
     * @return the {@link Site} configured for this processor
     */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Evaluates an XPath expression against the document and returns the result as a string.
     *
     * @param document the page HTML
     * @param xpath the XPath expression to evaluate
     * @return the value of {@code toString()} on the selection
     */
    private static String select(Html document, String xpath) {
        return document.xpath(xpath).toString();
    }

    /**
     * Starts a single-threaded spider at the rental list root and hands the results
     * to {@code MyPipeline}.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        Spider spider = Spider.create(new LianjiaPageProcessor())
                .addUrl("https://sh.lianjia.com/zufang/")
                .addPipeline(new MyPipeline())
                .thread(1);
        spider.run();
    }
}

爬取数据处理

java
import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;

import java.io.File;
import java.util.HashMap;
import java.util.Map;

/**
 * WebMagic {@link Pipeline} that stores one scraped listing.
 *
 * <p>It collects the extracted fields into a map, downloads the listing image next to
 * it, and appends the map as one JSON line to the data file. Fields that are missing
 * from the scraped page are left out of the record instead of aborting the pipeline
 * with an unchecked exception.
 */
public class MyPipeline implements Pipeline {

    private static final ObjectMapper MAPPER = new ObjectMapper();

    /** Directory prefix that downloaded images are written under. */
    private static final String IMAGE_DIR = "F:\\code\\images\\";

    /** File that receives one JSON document per stored listing. */
    private static final String DATA_FILE = "F:\\code\\data.json";

    /**
     * Builds the record for one listing, downloads its image and appends the record
     * to the data file.
     *
     * <p>Download and write failures are reported on standard error and the record is
     * not written; they are never propagated to the caller.
     *
     * @param resultItems the fields extracted by the page processor
     * @param task the running crawl task (unused)
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        String pageUrl = resultItems.getRequest().getUrl();

        Map<String, Object> data = new HashMap<>();
        data.put("url", pageUrl);
        data.put("title", resultItems.get("title")); // title
        data.put("rent", resultItems.get("rent")); // rent

        // "type" is a space-separated triple; any part may be absent on a malformed page.
        String type = resultItems.get("type");
        String[] types = StringUtils.split(type, ' ');
        putIfPresent(data, "rentMethod", types, 0); // rental method
        putIfPresent(data, "houseType", types, 1); // layout, e.g. 2 bedrooms, 1 living room, 1 bathroom
        putIfPresent(data, "orientation", types, 2); // orientation

        String info = resultItems.get("info");
        String[] infos = StringUtils.split(info, ' ');
        if (infos != null) {
            for (String item : infos) {
                // substringAfter keeps the whole value even if it contains another colon
                // and yields "" rather than failing when nothing follows the label.
                if (StringUtils.startsWith(item, "看房:")) {
                    data.put("time", StringUtils.substringAfter(item, ":"));
                } else if (StringUtils.startsWith(item, "楼层:")) {
                    data.put("floor", StringUtils.substringAfter(item, ":"));
                }
            }
        }

        // The image URL is the fourth quote-delimited token of the <img> markup.
        String img = resultItems.get("img");
        String[] imgParts = StringUtils.split(img, '"');
        String imageUrl = (imgParts != null && imgParts.length > 3) ? imgParts[3] : null;

        // Name the image after the last path segment of the page URL.
        String newName = StringUtils.substringBefore(
                StringUtils.substringAfterLast(pageUrl, "/"), ".") + ".jpg";

        try {
            if (imageUrl != null) {
                this.downloadFile(imageUrl, new File(IMAGE_DIR + newName));
                data.put("image", newName);
            }

            String json = MAPPER.writeValueAsString(data);
            FileUtils.write(new File(DATA_FILE), json + "\n", "UTF-8", true);
        } catch (Exception e) {
            // Best effort: one failed listing must not stop the crawl.
            System.err.println("Failed to store listing " + pageUrl);
            e.printStackTrace();
        }
    }

    /**
     * Copies {@code parts[index]} into {@code target} under {@code key} when that
     * element exists.
     *
     * @param target the map to add the entry to
     * @param key the key to store the value under
     * @param parts the split tokens, possibly {@code null}
     * @param index the position of the wanted token
     */
    private static void putIfPresent(Map<String, Object> target, String key,
                                     String[] parts, int index) {
        if (parts != null && index < parts.length) {
            target.put(key, parts[index]);
        }
    }

    /**
     * Downloads a file over HTTP and writes it to {@code dest}.
     *
     * <p>Both the HTTP client and the response are closed before this method returns.
     *
     * @param url the URL of the file to download
     * @param dest the file to write the downloaded bytes to
     * @throws Exception if the request fails, the response has no body, or the file
     *         cannot be written
     */
    public void downloadFile(String url, File dest) throws Exception {
        HttpGet httpGet = new HttpGet(url);
        try (CloseableHttpClient client = HttpClientBuilder.create().build();
             CloseableHttpResponse response = client.execute(httpGet)) {
            if (response.getEntity() == null) {
                throw new IllegalStateException("No response body for " + url);
            }
            FileUtils.writeByteArrayToFile(dest,
                    IOUtils.toByteArray(response.getEntity().getContent()));
        }
    }
}